package com.asiainfo.zqx;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import java.util.regex.Pattern;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Breadth-first crawler for the news listing at {@code www.zgcjj.cn}.
 *
 * <p>Starting from the seed listing page, every anchor whose absolute URL
 * matches the news URL pattern is queued with the type {@code "text"}. When a
 * page of that type is visited and its first metadata item mentions
 * {@code 财经界}, the URL, title, author, time, source and body text are printed
 * to standard output.
 */
public class CaiJinjie extends BreadthCrawler {
    /** Listing page the crawl starts from. */
    private static final String SEED_URL = "http://www.zgcjj.cn/list1cj.htm";

    /**
     * Absolute URLs that are queued as article pages. Compiled once because it
     * is tested against every anchor of every visited page.
     * NOTE(review): the dots are unescaped and therefore match any character;
     * left unchanged to keep the set of accepted URLs identical.
     */
    private static final Pattern NEWS_URL_PATTERN =
            Pattern.compile("http://www.zgcjj.cn/News/.*");

    /** Type tag attached to queued article URLs and checked in {@link #visit}. */
    private static final String ARTICLE_TYPE = "text";

    /** Selector for the metadata items (first item: author, second item: time). */
    private static final String META_ITEMS_SELECTOR = "div.navFunction>ul>li";

    /** Text the first metadata item must contain for a page to be treated as an article. */
    private static final String SOURCE_NAME = "财经界";

    /**
     * Number of leading characters dropped from the time item.
     * Presumably the length of a label preceding the timestamp; verify against the site markup.
     */
    private static final int TIME_LABEL_LENGTH = 3;

    /**
     * Creates the crawler, registers the seed URL and limits fetching to one thread.
     *
     * @param crawlPath passed to the {@code BreadthCrawler} constructor
     *                  (presumably the directory holding crawl state; confirm
     *                  against the WebCollector documentation)
     */
    public CaiJinjie(String crawlPath) {
        super(crawlPath, false);
        addSeed(SEED_URL);
        setThreads(1);
       // setResumable(true);
    }

    /**
     * Handles one fetched page: prints it when it looks like an article,
     * otherwise queues the article links it contains.
     *
     * @param page        the fetched page
     * @param crawlDatums collector for follow-up URLs
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        if (page.contentType() == null) {
            return;
        }
        if (page.matchType(ARTICLE_TYPE)) {
            // Evaluate the selector once and reuse the result for every field.
            Elements metaItems = page.select(META_ITEMS_SELECTOR);
            Element authorItem = metaItems.first();
            if (authorItem != null && authorItem.text().contains(SOURCE_NAME)) {
                printArticle(page, metaItems);
                return;
            }
        }
        collectNewsLinks(page, crawlDatums);
    }

    /** Prints the article fields of {@code page}; {@code metaItems} must be non-empty. */
    private void printArticle(Page page, Elements metaItems) {
        System.out.println("连接为" + page.url());
        System.out.println("标题为" + page.select("div.panelText>h1").text());
        // Element.text() never returns null, so no fallback author is needed.
        System.out.println("作者为" + metaItems.first().text());
        System.out.println("时间为" + extractTime(metaItems));
        System.out.println("来源为：财经界");
        System.out.println("正文为" + page.select("div#matterc>p").text());
    }

    /**
     * Returns the text of the second metadata item without its leading label,
     * or an empty string when the page has no second item. Text shorter than
     * the label is returned unchanged instead of raising an exception.
     */
    private static String extractTime(Elements metaItems) {
        if (metaItems.size() < 2) {
            return "";
        }
        String timeText = metaItems.get(1).text();
        return timeText.length() >= TIME_LABEL_LENGTH
                ? timeText.substring(TIME_LABEL_LENGTH)
                : timeText;
    }

    /** Queues every anchor of {@code page} whose absolute URL matches the news pattern. */
    private void collectNewsLinks(Page page, CrawlDatums crawlDatums) {
        for (Element anchor : page.select("a")) {
            String href = anchor.attr("abs:href");
            if (NEWS_URL_PATTERN.matcher(href).matches()) {
                crawlDatums.add(new CrawlDatum(href, ARTICLE_TYPE));
            }
        }
    }

    /**
     * Runs the crawler with the crawl path {@code "caijin"}, passing {@code 2}
     * to {@code start}.
     *
     * @param args unused
     * @throws Exception propagated from {@code start}
     */
    public static void main(String[] args) throws Exception {
        CaiJinjie caijin = new CaiJinjie("caijin");
        caijin.start(2);
    }
}
